TP1 EEA #Añadiendo las librerías necesarias
rm(list=ls())
library("dplyr")
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
library("tidyverse")
[37m-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.2.1 --[39m
[37m[32mv[37m [34mggplot2[37m 3.2.1 [32mv[37m [34mreadr [37m 1.3.1
[32mv[37m [34mtibble [37m 2.1.3 [32mv[37m [34mpurrr [37m 0.3.2
[32mv[37m [34mtidyr [37m 1.0.0 [32mv[37m [34mstringr[37m 1.2.0
[32mv[37m [34mggplot2[37m 3.2.1 [32mv[37m [34mforcats[37m 0.4.0[39m
[37m-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
#1.Preparacion de los datos (I) #a.Leer el archivo ar_properties.csv y mostrar su estructura Leyendo el archivo usando read.table Luego usando Glipse para dar un vistazo a la DB t0, t1 y tcorridaCSV serán usados para medir el tiempo de lectura del archivo
t0 <- Sys.time()
ar_properties <- read.table("ar_properties.csv",
sep=",",
dec=".",
header = TRUE,
fill = TRUE)
EOF within quoted string
t1 <- Sys.time()
tcorridaCSV <- as.numeric( t1 - t0, units = "secs")
glimpse(ar_properties)
Observations: 143,852
Variables: 24
$ id [3m[38;5;246m<fct>[39m[23m S0we3z3V2JpHUJreqQ2t/w==, kMxcmAS8NvrynGBVbMOEaQ==, Ce3ojF+ZTOkB8...
$ ad_type [3m[38;5;246m<fct>[39m[23m Propiedad, Propiedad, Propiedad, Propiedad, Propiedad, Propiedad,...
$ start_date [3m[38;5;246m<fct>[39m[23m 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-...
$ end_date [3m[38;5;246m<fct>[39m[23m 2019-06-14, 2019-04-16, 9999-12-31, 9999-12-31, 2019-07-09, 2019-...
$ created_on [3m[38;5;246m<fct>[39m[23m 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-...
$ lat [3m[38;5;246m<fct>[39m[23m -34.9433118208, -34.63181, NA, -34.65470505, -34.65494919, -32.93...
$ lon [3m[38;5;246m<fct>[39m[23m -54.9296557586, -58.420599, NA, -58.79089355, -58.787117, -60.683...
$ l1 [3m[38;5;246m<fct>[39m[23m Uruguay, Argentina, Argentina, Argentina, Argentina, Argentina, A...
$ l2 [3m[38;5;246m<fct>[39m[23m Maldonado, Capital Federal, Bs.As. G.B.A. Zona Norte, Bs.As. G.B....
$ l3 [3m[38;5;246m<fct>[39m[23m Punta del Este, Boedo, NA, Moreno, Moreno, Rosario, Ituzaingó, J...
$ l4 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, Moreno, Moreno, NA, Ituzaingó, NA, NA, NA, NA, NA, N...
$ l5 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ l6 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ rooms [3m[38;5;246m<fct>[39m[23m 2, NA, 2, 2, 2, 4, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ bedrooms [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ bathrooms [3m[38;5;246m<fct>[39m[23m 1, NA, 1, 2, 3, 1, 3, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ surface_total [3m[38;5;246m<fct>[39m[23m 45, NA, 200, 460, 660, NA, 70, NA, 1300, 405, 352, 373, 360, 1325...
$ surface_covered [3m[38;5;246m<fct>[39m[23m 40, NA, NA, 100, 148, 89, 122, NA, NA, NA, NA, NA, NA, 2, NA, NA,...
$ price [3m[38;5;246m<fct>[39m[23m 13000, 0, NA, NA, NA, NA, NA, NA, 0, NA, 0, NA, NA, NA, NA, NA, 0...
$ currency [3m[38;5;246m<fct>[39m[23m UYU, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ price_period [3m[38;5;246m<fct>[39m[23m Mensual, Mensual, NA, Mensual, Mensual, Mensual, Mensual, Mensual...
$ title [3m[38;5;246m<fct>[39m[23m Departamento - Roosevelt, PH - Boedo, Ituzaingo 1100 - $ 1 - Cas...
$ property_type [3m[38;5;246m<fct>[39m[23m Departamento, PH, Casa, Casa, Casa, Casa, Casa, Casa, Lote, Lote,...
$ operation_type [3m[38;5;246m<fct>[39m[23m Alquiler, Venta, Alquiler, Venta, Venta, Venta, Venta, Alquiler, ...
#b.Quedarse con aquellos registros que: # i.Pertenecen a Argentina y Capital Federal # ii.Cuyo precio esta en dolares (USD) # iii.El tipo de propiedad sea: Departamento, PH o Casa # iv.El tipo de operacion sea Venta #Revisé la variable Currency para ver si todas estaban marcadas por USD para dolaers #Revisé los NA en country l1 #Deperatamento 42041 #PH 4564 #Casa 21535 #Sólo por el filtro de “Capital Federal” tenemos 47577
ar_properties_filtrado <- ar_properties %>% filter(l1 == "Argentina", l2 == "Capital Federal",
currency == "USD" ,
property_type %in% c("Casa","PH", "Departamento"),
operation_type == "Venta")
glimpse(ar_properties_filtrado)
Observations: 24,323
Variables: 24
$ id [3m[38;5;246m<fct>[39m[23m oyj+f764ALCYodIqBvWAww==, HdjpKrqdwYfH9YU1DKjltg==, YwWE3rTb2+gms...
$ ad_type [3m[38;5;246m<fct>[39m[23m Propiedad, Propiedad, Propiedad, Propiedad, Propiedad, Propiedad,...
$ start_date [3m[38;5;246m<fct>[39m[23m 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-...
$ end_date [3m[38;5;246m<fct>[39m[23m 2019-07-10, 2019-04-15, 2019-06-30, 9999-12-31, 2019-05-21, 9999-...
$ created_on [3m[38;5;246m<fct>[39m[23m 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-04-14, 2019-...
$ lat [3m[38;5;246m<fct>[39m[23m -34.6522498, -34.6282483, -34.5927955, -34.56563187, -34.62217712...
$ lon [3m[38;5;246m<fct>[39m[23m -58.385565, -58.4065245, -58.4209298, -58.46513367, -58.52272415,...
$ l1 [3m[38;5;246m<fct>[39m[23m Argentina, Argentina, Argentina, Argentina, Argentina, Argentina,...
$ l2 [3m[38;5;246m<fct>[39m[23m Capital Federal, Capital Federal, Capital Federal, Capital Federa...
$ l3 [3m[38;5;246m<fct>[39m[23m Barracas, Boedo, Palermo, Belgrano, Versalles, Velez Sarsfield, N...
$ l4 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ l5 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ l6 [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ rooms [3m[38;5;246m<fct>[39m[23m NA, 6, NA, 3, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,...
$ bedrooms [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ bathrooms [3m[38;5;246m<fct>[39m[23m NA, 2, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
$ surface_total [3m[38;5;246m<fct>[39m[23m 300, 178, 240, 157, 140, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, ...
$ surface_covered [3m[38;5;246m<fct>[39m[23m 180, 240, 157, NA, 110, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
$ price [3m[38;5;246m<fct>[39m[23m 320000, 500000, 350000, 470000, 155000, 199900, 147000, 92294, 11...
$ currency [3m[38;5;246m<fct>[39m[23m USD, USD, USD, USD, USD, USD, USD, USD, USD, USD, USD, USD, USD, ...
$ price_period [3m[38;5;246m<fct>[39m[23m Mensual, Mensual, Mensual, NA, NA, NA, Mensual, Mensual, Mensual,...
$ title [3m[38;5;246m<fct>[39m[23m "PH EN VENTA", "Casa - San Telmo", "CASA EN VENTA", "Mendoza 320...
$ property_type [3m[38;5;246m<fct>[39m[23m PH, Casa, Casa, Casa, Casa, Casa, Departamento, Departamento, Dep...
$ operation_type [3m[38;5;246m<fct>[39m[23m Venta, Venta, Venta, Venta, Venta, Venta, Venta, Venta, Venta, Ve...
#c.Seleccionar las variables id, l3, rooms, bedrooms, bathrooms, surface_total, surface_covered, price y property_type
ar_properties_filtrado <- ar_properties_filtrado %>%
select(id, l3, rooms, bedrooms, bathrooms, surface_total, surface_covered, price, property_type)
glimpse(ar_properties_filtrado)
Observations: 24,323
Variables: 9
$ id [3m[38;5;246m<fct>[39m[23m oyj+f764ALCYodIqBvWAww==, HdjpKrqdwYfH9YU1DKjltg==, YwWE3rTb2+gms...
$ l3 [3m[38;5;246m<fct>[39m[23m Barracas, Boedo, Palermo, Belgrano, Versalles, Velez Sarsfield, N...
$ rooms [3m[38;5;246m<fct>[39m[23m NA, 6, NA, 3, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,...
$ bedrooms [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ bathrooms [3m[38;5;246m<fct>[39m[23m NA, 2, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
$ surface_total [3m[38;5;246m<fct>[39m[23m 300, 178, 240, 157, 140, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, ...
$ surface_covered [3m[38;5;246m<fct>[39m[23m 180, 240, 157, NA, 110, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
$ price [3m[38;5;246m<fct>[39m[23m 320000, 500000, 350000, 470000, 155000, 199900, 147000, 92294, 11...
$ property_type [3m[38;5;246m<fct>[39m[23m PH, Casa, Casa, Casa, Casa, Casa, Departamento, Departamento, Dep...
#2.Analisis exploratorios (I) #a.Obtener la cantidad de valores unicos y de valores faltantes (NAs) para cada una de estas variables #Para esto usaremos la función ‘unique’
unique(ar_properties_filtrado)
#Pero con apply la podemos aplicar a todo el dataFrame sin duplicar código
uniqueValues <- apply(ar_properties_filtrado,2,unique)
#Tenemos como resultado:
uniqueValues$l3
[1] "Barracas" "Boedo" "Palermo"
[4] "Belgrano" "Versalles" "Velez Sarsfield"
[7] "Nuñez" "Almagro" "Caballito"
[10] "Catalinas" "San Telmo" "Villa Crespo"
[13] "Puerto Madero" "Villa Urquiza" "Parque Chacabuco"
[16] "Retiro" "Floresta" "Recoleta"
[19] "Saavedra" "Balvanera" "Colegiales"
[22] "Parque Chas" "Barrio Norte" "Villa Devoto"
[25] "Villa Ortuzar" "Villa Pueyrredón" "Paternal"
[28] "Villa Real" "Once" "Flores"
[31] "Las Cañitas" "Villa Santa Rita" "Centro / Microcentro"
[34] "Villa del Parque" "Parque Centenario" "Congreso"
[37] "Parque Avellaneda" "Chacarita" "Abasto"
[40] "San Cristobal" "Boca" "Liniers"
[43] "Villa General Mitre" "AgronomÃa" "Parque Patricios"
[46] "Coghlan" "Monserrat" "San Nicolás"
[49] "Villa Lugano" NA "Constitución"
[52] "Mataderos" "Monte Castro" "Villa Luro"
[55] "Pompeya" "Tribunales" "Villa Soldati"
[58] "Villa Riachuelo"
#l3 -57 Barrios entontrados y NA
uniqueValues$rooms
[1] NA "6" "3" "1" "2" "5" "4" "7" "8" "9" "10" "12" "11" "15" "26" "32" "17" "20"
#rooms 17 categorías (y NA) sin embargo se tienen valores que no tienen mucho sentido
#Algunos de ellos parecieran ser excesivos
uniqueValues$bedrooms
[1] NA "0" "1" "2" "3" "4" "5" "7" "6" "8" "16" "9" "25" "130" "10"
[16] "13" "11"
#bedrooms 17 categorías (y NA) se tiene un outlier de 130 que debería ser un error de tipeo
#dado que en rooms no tenemos ningún outlier que se le acerque
uniqueValues$bathrooms
[1] NA "2" "4" "1" "3" "5" "7" "6" "10" "9" "8" "13" "14"
#bathrooms 12 categorías (y NA) los valores más grandes son llamativos
uniqueValues$surface_total
[1] "300" "178" "240" "157" "140" "95" "44" "40" "49"
[10] "23" "32" "36" "90" "45" "54" "33" "59" "55"
[19] "127" "38" "174" "75" "70" "31" "48" "53" "39"
[28] "58" "42" "52" "64" "50" "83" "41" "29" "69"
[37] "43" "84" "62" "63" "60" "100" "170" "5821" "96"
[46] "187" "110" "68" "78" "76" "113" "92" "67" "97"
[55] "150" "197" "80" "184" "133" "108" "91" "77" "190"
[64] "85" "180" NA "94" "128" "277" "111" "202" "317"
[73] "640" "1309" "503" "320" "360" "125" "120" "26" "81"
[82] "56" "220" "65" "28" "25" "35" "160" "27" "115"
[91] "66" "37" "103" "79" "47" "34" "82" "87" "72"
[100] "112" "200" "230" "74" "71" "89" "143" "102" "165"
[109] "166" "280" "710" "411" "185" "223" "273" "126" "99"
[118] "260" "98" "154" "213" "136" "158" "270" "144" "420"
[127] "210" "462" "175" "612" "269" "142" "219" "105" "107"
[136] "135" "244" "322" "225" "374" "109" "233" "30" "22"
[145] "106" "440" "24" "148" "20" "86" "73" "61" "130"
[154] "88" "46" "173" "370" "276" "207" "51" "146" "131"
[163] "93" "57" "312" "457" "21" "129" "242" "363" "114"
[172] "104" "101" "134" "182" "198" "250" "340" "118" "161"
[181] "217" "179" "124" "298" "425" "371" "562" "212" "196"
[190] "286" "400" "159" "355" "430" "284" "315" "330" "336"
[199] "386" "350" "384" "177" "123" "141" "119" "183" "138"
[208] "155" "239" "122" "195" "338" "238" "249" "335" "117"
[217] "394" "304" "214" "241" "215" "145" "253" "116" "402"
[226] "235" "407" "194" "224" "264" "409" "216" "508" "601"
[235] "139" "523" "405" "431" "132" "171" "246" "248" "271"
[244] "19" "162" "152" "258" "10145" "228" "203" "164" "181"
[253] "121" "337" "186" "188" "156" "192" "147" "500" "231"
[262] "255" "380" "168" "153" "193" "331" "172" "149" "550"
[271] "278" "532" "352" "167" "347" "390" "189" "436" "910"
[280] "279" "287" "15" "191" "199" "2948" "169" "262" "288"
[289] "259" "245" "290" "533" "204" "658" "305" "222" "289"
[298] "205" "254" "275" "765" "445" "265" "332" "388" "306"
[307] "364" "359" "16" "399" "163" "308" "327" "395" "252"
[316] "236" "303" "345" "464" "495" "600" "14" "3935" "309"
[325] "211" "201" "243" "232" "299" "297" "510" "580" "392"
[334] "1216" "650" "540" "285" "385" "17" "137" "3608" "637"
[343] "616" "11250" "302" "634" "375" "1000" "950" "151" "251"
[352] "206" "624" "1112" "413" "226" "227" "263" "176" "437"
[361] "406" "422" "719" "598" "610" "261" "291" "307" "377"
[370] "234" "750" "310" "325" "281" "268" "256" "535" "880"
[379] "450" "745" "1050" "293" "294" "372" "296" "339" "401"
[388] "567" "570" "221" "5647" "486" "343" "492" "348" "283"
[397] "444" "313" "850" "737" "301" "208" "361" "522" "326"
[406] "397" "257" "362" "677" "483" "328" "455" "356" "351"
[415] "490" "945" "18" "274" "229" "489" "342" "602" "267"
[424] "470" "0" "475" "237" "473" "820" "282" "465" "2000"
[433] "292" "314" "410" "382" "126062" "516" "858" "487" "481"
[442] "5322" "453" "560" "781" "329" "379" "730" "318" "780"
[451] "460" "354" "424" "515" "847" "800" "536" "502" "266"
[460] "295" "341" "6085" "272" "524" "649" "324" "700" "334"
[469] "632" "358" "1200" "16725" "247" "12" "529" "609" "525"
[478] "451" "446" "995" "323" "568" "494" "537" "6905" "576"
[487] "396" "209" "311" "654" "346" "9053" "378" "615" "480"
[496] "621" "417" "1060" "13" "593" "387" "6845" "376" "726"
[505] "672" "218" "518" "447" "630" "365" "591" "506" "853"
[514] "554" "389" "520" "373" "590" "715" "426" "521" "383"
[523] "534" "783" "435" "545" "962" "1400" "900" "6600" "2773"
[532] "4534" "526" "391" "923" "367" "736" "398" "4500"
uniqueValues$surface_covered
[1] "180" "240" "157" NA "110" "69" "38" "37" "44"
[10] "23" "30" "34" "33" "90" "35" "48" "40" "32"
[19] "41" "36" "55" "127" "174" "29" "68" "70" "28"
[28] "50" "58" "60" "54" "39" "65" "47" "31" "49"
[37] "57" "45" "100" "73" "120" "53" "96" "91" "187"
[46] "77" "56" "63" "66" "62" "46" "103" "101" "80"
[55] "72" "112" "93" "74" "178" "133" "87" "129" "85"
[64] "81" "171" "145" "76" "94" "234" "75" "95" "170"
[73] "123" "288" "420" "1000" "192" "450" "300" "128" "26"
[82] "220" "25" "59" "148" "27" "52" "67" "105" "4"
[91] "43" "150" "230" "84" "89" "83" "135" "61" "78"
[100] "140" "102" "265" "185" "200" "160" "107" "130" "115"
[109] "92" "244" "152" "213" "64" "225" "121" "158" "175"
[118] "182" "250" "395" "210" "328" "163" "1" "286" "688"
[127] "264" "143" "214" "97" "149" "104" "189" "233" "22"
[136] "51" "99" "406" "21" "17" "71" "42" "79" "108"
[145] "125" "380" "146" "98" "249" "165" "114" "18" "24"
[154] "147" "19" "86" "154" "172" "137" "162" "118" "136"
[163] "119" "139" "195" "173" "266" "310" "144" "196" "263"
[172] "205" "155" "372" "169" "315" "142" "190" "255" "330"
[181] "138" "386" "350" "126" "82" "295" "177" "166" "106"
[190] "111" "124" "159" "132" "277" "280" "238" "117" "394"
[199] "270" "151" "304" "134" "227" "232" "184" "251" "374"
[208] "194" "275" "259" "409" "340" "260" "355" "400" "88"
[217] "453" "161" "331" "226" "109" "20" "390" "203" "10145"
[226] "211" "186" "188" "156" "141" "500" "215" "122" "191"
[235] "164" "116" "198" "193" "357" "312" "113" "332" "183"
[244] "342" "181" "550" "294" "217" "287" "15" "199" "2948"
[253] "2667" "131" "291" "271" "285" "153" "540" "345" "248"
[262] "418" "257" "16" "282" "242" "218" "258" "212" "228"
[271] "14" "3935" "176" "352" "231" "279" "432" "338" "946"
[280] "650" "385" "290" "219" "576" "252" "8830" "341" "700"
[289] "206" "167" "491" "750" "413" "302" "216" "437" "422"
[298] "425" "320" "408" "335" "600" "322" "370" "360" "179"
[307] "12" "197" "276" "325" "267" "268" "880" "297" "209"
[316] "745" "870" "293" "246" "202" "296" "313" "401" "306"
[325] "567" "388" "8" "207" "435" "245" "281" "3" "444"
[334] "440" "10" "460" "508" "239" "208" "308" "261" "262"
[343] "236" "243" "362" "568" "483" "168" "356" "235" "339"
[352] "344" "201" "317" "845" "274" "278" "421" "224" "820"
[361] "1400" "126062" "560" "6770" "448" "5322" "414" "222" "412"
[370] "237" "530" "229" "354" "319" "324" "378" "301" "391"
[379] "800" "375" "506" "273" "2708" "3308" "5821" "8080" "377"
[388] "3506" "272" "397" "347" "570" "379" "371" "269" "303"
[397] "307" "1200" "247" "254" "256" "457" "478" "438" "309"
[406] "519" "323" "298" "532" "8050" "559" "486" "580" "221"
[415] "204" "253" "623" "327" "424" "980" "314" "472" "5930"
[424] "6756" "376" "442" "475" "292" "652" "5" "7" "284"
[433] "366" "436" "326" "353" "630" "336" "430" "334" "455"
[442] "715" "426" "318" "361" "383" "321" "363" "384" "470"
[451] "902" "850" "1978" "590" "431" "2773" "1033" "867" "526"
[460] "410" "641"
#surface_total y surface_covered alta variabilidad, dado que es una variable numérica,
#se podría transformar en categórica revisando más a fondo los valores
summary(as.integer(uniqueValues$surface_total))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 145.2 279.5 749.6 450.8 126062.0 1
summary(as.integer(uniqueValues$surface_covered))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.0 120.8 236.5 719.7 374.2 126062.0 1
#Price
uniqueValues$price
[1] "320000" "500000" "350000" "470000" "155000" "199900" "147000" "92294"
[9] "115000" "77000" "88900" "88798" "110975" "92943" "69000" "99000"
[17] "96984" "125000" "99500" "121600" "285000" "150000" "140000" "110000"
[25] "96000" "248200" "152732" "310000" "108000" "94800" "1080000" "88000"
[33] "225000" "85000" "129900" "185000" "158000" "98000" "80000" "159900"
[41] "87000" "178000" "172000" "208000" "83000" "134000" "135000" "105000"
[49] "165000" "90000" "104500" "119000" "192000" "390000" "169000" "86000"
[57] "79000" "315000" "230000" "120000" "259000" "180000" "269900" "190000"
[65] "175000" "130000" "209000" "270000" "340000" "194500" "255000" "139000"
[73] "439000" "729000" "128000" "101000" "92000" "245000" "214900" "200000"
[81] "385000" "370000" "220000" "189000" "239000" "260000" "252000" "290000"
[89] "790000" "595000" "280000" "195000" "613567" "215000" "210000" "580000"
[97] "224000" "850000" "680000" "1115000" "900000" "250000" "199000" "249000"
[105] "94000" "95000" "70000" "65000" "795000" "325000" "182000" "183000"
[113] "380000" "770000" "68000" "84000" "160000" "100700" "102000" "168000"
[121] "855000" "865000" "485000" "780000" "75000" "106000" "142000" "235000"
[129] "510000" "103000" "98500" "104000" "187300" "123900" "128400" "85600"
[137] "80400" "193000" "195200" "126400" "92400" "191200" "136500" "305000"
[145] "720000" "107000" "74000" "54900" "109000" "145000" "143000" "116000"
[153] "204000" "100000" "239800" "93000" "205000" "138000" "112000" "360000"
[161] "124000" "170000" "84102" "127000" "111000" "173000" "217000" "152000"
[169] "149000" "286000" "995000" "275000" "206700" "490000" "268000" "295000"
[177] "460000" "650000" "980000" "440000" "384000" "450000" "1700000" "397000"
[185] "550000" "620000" "184000" "1850000" "1650000" "375000" "540000" "1300000"
[193] "1970000" "1250000" "399000" "628000" "1150000" "1350000" "1600000" "297000"
[201] "278000" "332000" "133000" "479000" "365000" "355000" "240000" "649000"
[209] "420000" "236000" "264000" "300000" "749000" "84500" "69500" "97800"
[217] "415000" "73000" "82000" "132000" "138500" "419000" "1200000" "64900"
[225] "445000" "359500" "239500" "206000" "282000" "118000" "89000" "198000"
[233] "129000" "78000" "299000" "111900" "192900" "64000" "318000" "174000"
[241] "157354" "108286" "99900" "219000" "42000" "329000" "895000" "149500"
[249] "157800" "137000" "520000" "1100000" "144900" "398000" "124999" "157500"
[257] "117000" "378000" "191911" "289000" "37000" "345000" "518000" "83500"
[265] "395000" "430000" "269000" "49000" "114000" "99800" "114500" "90500"
[273] "128500" "241500" "62000" "203788" "211011" "194400" "45000" "95266"
[281] "85178" "93989" "95832" "88372" "93374" "83370" "92479" "87341"
[289] "98428" "91602" "98930" "90040" "90766" "91707" "99518" "84274"
[297] "94192" "89054" "101360" "86106" "89770" "97098" "102214" "100762"
[305] "94474" "97675" "85037" "90857" "92146" "86705" "95904" "97192"
[313] "100321" "85629" "83916" "104106" "81703" "95809" "96535" "94642"
[321] "141000" "159000" "55000" "103300" "89900" "135300" "284000" "97000"
[329] "170052" "162400" "330000" "123000" "84900" "368000" "314200" "410000"
[337] "144500" "177000" "302797" "276404" "288569" "277564" "97500" "244290"
[345] "162505" "248081" "203000" "67500" "88500" "212500" "159500" "79900"
[353] "670000" "298000" "572000" "404000" "126900" "239999" "1000000" "232000"
[361] "480000" "364000" "400000" "186000" "353000" "335000" "307000" "265000"
[369] "342000" "218000" "227000" "228000" "279900" "242000" "149600" "169900"
[377] "254000" "153000" "235500" "52000" "176000" "179000" "189900" "575000"
[385] "249999" "154900" "744000" "886000" "349000" "449900" "387000" "590000"
[393] "549000" "212000" "304500" "495000" "920000" "3500000" "515000" "750000"
[401] "3400000" "690000" "640000" "698000" "880000" "565000" "598000" "144000"
[409] "156000" "194900" "95500" "237000" "156960" "294000" "162000" "72000"
[417] "160125" "59900" "95200" "109500" "119500" "113000" "35000" "475000"
[425] "125900" "173583" "99100" "122500" "372000" "168900" "226000" "630000"
[433] "163000" "350550" "279000" "493000" "449000" "309000" "57900" "870000"
[441] "563000" "194000" "319000" "465000" "615000" "229000" "2550000" "3700000"
[449] "1480000" "940000" "860000" "359100" "448000" "343000" "719000" "444000"
[457] "828000" "455000" "267000" "524000" "148000" "435000" "2900000" "950000"
[465] "4500000" "675000" "530000" "560000" "47000" "425000" "93500" "58174"
[473] "75542" "98280" "95280" "730000" "187000" "570000" "122000" "157000"
[481] "145600" "149900" "825000" "1140000" "1400000" "1050000" "625000" "164000"
[489] "60000" "890000" "427696" "712000" "85720" "133849" "59000" "106800"
[497] "119700" "116800" "118600" "114800" "102700" "113800" "76000" "313000"
[505] "74500" "745900" "84501" "120150" "197860" "190250" "115700" "119900"
[513] "185080" "103320" "50000" "89999" "73900" "186600" "133500" "105100"
[521] "597000" "158900" "136422" "803648" "627880" "129294" "2169696" "69900"
[529] "127800" "60500" "64800" "105948" "99573" "148300" "106948" "133900"
[537] "156890" "76900" "164198" "248000" "635000" "229900" "164600" "147500"
[545] "199500" "202375" "233000" "229500" "367000" "158500" "158250" "217500"
[553] "177600" "279500" "166000" "39500" "37500" "216000" "289100" "244900"
[561] "4000000" "224900" "287000" "369000" "897000" "1500000" "1550000" "1800000"
[569] "1380000" "223000" "4800000" "2800000" "351000" "845000" "660000" "699000"
[577] "1680000" "188000" "87749" "257000" "189500" "202566" "198432" "210834"
[585] "215488" "214968" "45900" "166600" "90800" "81000" "2345678" "124500"
[593] "167000" "316484" "262000" "126825" "139500" "131100" "110600" "1234567"
[601] "121000" "74900" "56000" "169500" "134800" "109900" "61000" "80047"
[609] "77553" "77500" "89800" "346000" "1030000" "559000" "591361" "558571"
[617] "68500" "75500" "65500" "72500" "85500" "79500" "64500" "71000"
[625] "89500" "66500" "73500" "43000" "63000" "76500" "58500" "82500"
[633] "62500" "80500" "92500" "56500" "59500" "70500" "91650" "101050"
[641] "725000" "104436" "115788" "64212" "118463" "44592" "69258" "67241"
[649] "92818" "90804" "47294" "61993" "134725" "156188" "55517" "58237"
[657] "49135" "136000" "66000" "112400" "138600" "153500" "201500" "66900"
[665] "83800" "133575" "147275" "193500" "122900" "86130" "101039" "181140"
[673] "148200" "109800" "102573" "93600" "104948" "64950" "990000" "1390000"
[681] "1340000" "184300" "272000" "174900" "119990" "389000" "496000" "228542"
[689] "387090" "359000" "292000" "249500" "238000" "452000" "1360000" "1450000"
[697] "260850" "459000" "545000" "519000" "3000000" "1099000" "296000" "128900"
[705] "535000" "80711" "79120" "371000" "81900" "32000" "102526" "102481"
[713] "104311" "112587" "87500" "39000" "104900" "67000" "179500" "126000"
[721] "442650" "476700" "499800" "284997" "374000" "695000" "408400" "196000"
[729] "314000" "151000" "882700" "781820" "819650" "807040" "794430" "197000"
[737] "258000" "321000" "1750000" "338000" "910000" "318200" "738400" "875000"
[745] "555000" "423300" "91500" "998000" "428000" "243000" "154000" "389500"
[753] "68100" "108500" "129500" "73600" "108700" "79200" "60200" "106600"
[761] "27900" "175800" "139900" "110900" "399900" "124900" "211000" "915000"
[769] "166900" "327000" "610000" "1620000" "3089000" "2490000" "700000" "1195000"
[777] "1440000" "1545000" "1730000" "69800" "138900" "86500" "247000" "57000"
[785] "405000" "142900" "91900" "85900" "180380" "151287" "209500" "131990"
[793] "339000" "115900" "82900" "184980" "179900" "214000" "183400" "49900"
[801] "599000" "1490000" "348000" "1950000" "51500" "144800" "159300" "145800"
[809] "289500" "159700" "134900" "241400" "830000" "186070" "86600" "114900"
[817] "476000" "91000" "26500" "191990" "124990" "106150" "100760" "1395000"
[825] "1604320" "129981" "106900" "97900" "122845" "110935" "94900" "202000"
[833] "265500" "390250" "407000" "710000" "394000" "498000" "685000" "112500"
[841] "336000" "261000" "277000" "2500000" "2300000" "960000" "1090000" "166400"
[849] "107900" "78500" "131000" "141050" "100580" "73295" "94160" "125653"
[857] "118500" "357000" "698900" "975000" "760000" "58000" "255982" "95770"
[865] "201370" "134016" "101516" "108216" "139454" "103695" "102215" "105216"
[873] "89700" "100070" "140003" "148539" "146078" "126246" "142473" "145212"
[881] "134006" "136016" "55500" "79600" "71662" "319500" "712600" "379000"
[889] "79750" "73402" "80892" "74965" "76560" "78155" "81345" "52900"
[897] "273000" "2700000" "2350000" "213500" "83285" "112808" "86940" "118950"
[905] "150135" "48000" "1075000" "288000" "434203" "256050" "267075" "278700"
[913] "482893" "184500" "777777" "1111111" "177900" "1130000" "474000" "80900"
[921] "184900" "163510" "147318" "119404" "124572" "121575" "94500" "169149"
[929] "114629" "106754" "70067" "122348" "69008" "252595" "172044" "184375"
[937] "215763" "130921" "174699" "187500" "93636" "197701" "106400" "197975"
[945] "23500" "44000" "849000" "118800" "77292" "75190" "55600" "90931"
[953] "112750" "121900" "89100" "154500" "134700" "930000" "169100" "198300"
[961] "177800" "181700" "101200" "113454" "131500" "168500" "200485" "196500"
[969] "151500" "263000" "149800" "249600" "362300" "247100" "285400" "366000"
[977] "232500" "231200" "213000" "169990" "150500" "180800" "172500" "121500"
[985] "53000" "137800" "167424" "373000" "418000" "2200000" "800000" "472500"
[993] "328000" "310300" "327100" "999999" "542800" "564500" "312300" "547600"
[ reached getOption("max.print") -- omitted 1417 entries ]
#price alta variabilidad, dado que es una variable numérica,
#se podría transformar en categórica revisando más a fondo los valores
summary(as.integer(uniqueValues$price))
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 104106 158032 335764 296500 23456789
#Tipo de propiedad
uniqueValues$property_type
[1] "PH" "Casa" "Departamento"
#Las tres gategorías filtradas del dataset: PH, Casa, Departamento
#surface_total y surface_covered alta variabilidad, dado que es una variable numérica, #se podría transformar en categórica revisando más a fondo los valores
summary(as.integer(uniqueValues$surface_total))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 145.2 279.5 749.6 450.8 126062.0 1
summary(as.integer(uniqueValues$surface_covered))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.0 120.8 236.5 719.7 374.2 126062.0 1
#Price
uniqueValues$price
[1] "320000" "500000" "350000" "470000" "155000" "199900" "147000" "92294"
[9] "115000" "77000" "88900" "88798" "110975" "92943" "69000" "99000"
[17] "96984" "125000" "99500" "121600" "285000" "150000" "140000" "110000"
[25] "96000" "248200" "152732" "310000" "108000" "94800" "1080000" "88000"
[33] "225000" "85000" "129900" "185000" "158000" "98000" "80000" "159900"
[41] "87000" "178000" "172000" "208000" "83000" "134000" "135000" "105000"
[49] "165000" "90000" "104500" "119000" "192000" "390000" "169000" "86000"
[57] "79000" "315000" "230000" "120000" "259000" "180000" "269900" "190000"
[65] "175000" "130000" "209000" "270000" "340000" "194500" "255000" "139000"
[73] "439000" "729000" "128000" "101000" "92000" "245000" "214900" "200000"
[81] "385000" "370000" "220000" "189000" "239000" "260000" "252000" "290000"
[89] "790000" "595000" "280000" "195000" "613567" "215000" "210000" "580000"
[97] "224000" "850000" "680000" "1115000" "900000" "250000" "199000" "249000"
[105] "94000" "95000" "70000" "65000" "795000" "325000" "182000" "183000"
[113] "380000" "770000" "68000" "84000" "160000" "100700" "102000" "168000"
[121] "855000" "865000" "485000" "780000" "75000" "106000" "142000" "235000"
[129] "510000" "103000" "98500" "104000" "187300" "123900" "128400" "85600"
[137] "80400" "193000" "195200" "126400" "92400" "191200" "136500" "305000"
[145] "720000" "107000" "74000" "54900" "109000" "145000" "143000" "116000"
[153] "204000" "100000" "239800" "93000" "205000" "138000" "112000" "360000"
[161] "124000" "170000" "84102" "127000" "111000" "173000" "217000" "152000"
[169] "149000" "286000" "995000" "275000" "206700" "490000" "268000" "295000"
[177] "460000" "650000" "980000" "440000" "384000" "450000" "1700000" "397000"
[185] "550000" "620000" "184000" "1850000" "1650000" "375000" "540000" "1300000"
[193] "1970000" "1250000" "399000" "628000" "1150000" "1350000" "1600000" "297000"
[201] "278000" "332000" "133000" "479000" "365000" "355000" "240000" "649000"
[209] "420000" "236000" "264000" "300000" "749000" "84500" "69500" "97800"
[217] "415000" "73000" "82000" "132000" "138500" "419000" "1200000" "64900"
[225] "445000" "359500" "239500" "206000" "282000" "118000" "89000" "198000"
[233] "129000" "78000" "299000" "111900" "192900" "64000" "318000" "174000"
[241] "157354" "108286" "99900" "219000" "42000" "329000" "895000" "149500"
[249] "157800" "137000" "520000" "1100000" "144900" "398000" "124999" "157500"
[257] "117000" "378000" "191911" "289000" "37000" "345000" "518000" "83500"
[265] "395000" "430000" "269000" "49000" "114000" "99800" "114500" "90500"
[273] "128500" "241500" "62000" "203788" "211011" "194400" "45000" "95266"
[281] "85178" "93989" "95832" "88372" "93374" "83370" "92479" "87341"
[289] "98428" "91602" "98930" "90040" "90766" "91707" "99518" "84274"
[297] "94192" "89054" "101360" "86106" "89770" "97098" "102214" "100762"
[305] "94474" "97675" "85037" "90857" "92146" "86705" "95904" "97192"
[313] "100321" "85629" "83916" "104106" "81703" "95809" "96535" "94642"
[321] "141000" "159000" "55000" "103300" "89900" "135300" "284000" "97000"
[329] "170052" "162400" "330000" "123000" "84900" "368000" "314200" "410000"
[337] "144500" "177000" "302797" "276404" "288569" "277564" "97500" "244290"
[345] "162505" "248081" "203000" "67500" "88500" "212500" "159500" "79900"
[353] "670000" "298000" "572000" "404000" "126900" "239999" "1000000" "232000"
[361] "480000" "364000" "400000" "186000" "353000" "335000" "307000" "265000"
[369] "342000" "218000" "227000" "228000" "279900" "242000" "149600" "169900"
[377] "254000" "153000" "235500" "52000" "176000" "179000" "189900" "575000"
[385] "249999" "154900" "744000" "886000" "349000" "449900" "387000" "590000"
[393] "549000" "212000" "304500" "495000" "920000" "3500000" "515000" "750000"
[401] "3400000" "690000" "640000" "698000" "880000" "565000" "598000" "144000"
[409] "156000" "194900" "95500" "237000" "156960" "294000" "162000" "72000"
[417] "160125" "59900" "95200" "109500" "119500" "113000" "35000" "475000"
[425] "125900" "173583" "99100" "122500" "372000" "168900" "226000" "630000"
[433] "163000" "350550" "279000" "493000" "449000" "309000" "57900" "870000"
[441] "563000" "194000" "319000" "465000" "615000" "229000" "2550000" "3700000"
[449] "1480000" "940000" "860000" "359100" "448000" "343000" "719000" "444000"
[457] "828000" "455000" "267000" "524000" "148000" "435000" "2900000" "950000"
[465] "4500000" "675000" "530000" "560000" "47000" "425000" "93500" "58174"
[473] "75542" "98280" "95280" "730000" "187000" "570000" "122000" "157000"
[481] "145600" "149900" "825000" "1140000" "1400000" "1050000" "625000" "164000"
[489] "60000" "890000" "427696" "712000" "85720" "133849" "59000" "106800"
[497] "119700" "116800" "118600" "114800" "102700" "113800" "76000" "313000"
[505] "74500" "745900" "84501" "120150" "197860" "190250" "115700" "119900"
[513] "185080" "103320" "50000" "89999" "73900" "186600" "133500" "105100"
[521] "597000" "158900" "136422" "803648" "627880" "129294" "2169696" "69900"
[529] "127800" "60500" "64800" "105948" "99573" "148300" "106948" "133900"
[537] "156890" "76900" "164198" "248000" "635000" "229900" "164600" "147500"
[545] "199500" "202375" "233000" "229500" "367000" "158500" "158250" "217500"
[553] "177600" "279500" "166000" "39500" "37500" "216000" "289100" "244900"
[561] "4000000" "224900" "287000" "369000" "897000" "1500000" "1550000" "1800000"
[569] "1380000" "223000" "4800000" "2800000" "351000" "845000" "660000" "699000"
[577] "1680000" "188000" "87749" "257000" "189500" "202566" "198432" "210834"
[585] "215488" "214968" "45900" "166600" "90800" "81000" "2345678" "124500"
[593] "167000" "316484" "262000" "126825" "139500" "131100" "110600" "1234567"
[601] "121000" "74900" "56000" "169500" "134800" "109900" "61000" "80047"
[609] "77553" "77500" "89800" "346000" "1030000" "559000" "591361" "558571"
[617] "68500" "75500" "65500" "72500" "85500" "79500" "64500" "71000"
[625] "89500" "66500" "73500" "43000" "63000" "76500" "58500" "82500"
[633] "62500" "80500" "92500" "56500" "59500" "70500" "91650" "101050"
[641] "725000" "104436" "115788" "64212" "118463" "44592" "69258" "67241"
[649] "92818" "90804" "47294" "61993" "134725" "156188" "55517" "58237"
[657] "49135" "136000" "66000" "112400" "138600" "153500" "201500" "66900"
[665] "83800" "133575" "147275" "193500" "122900" "86130" "101039" "181140"
[673] "148200" "109800" "102573" "93600" "104948" "64950" "990000" "1390000"
[681] "1340000" "184300" "272000" "174900" "119990" "389000" "496000" "228542"
[689] "387090" "359000" "292000" "249500" "238000" "452000" "1360000" "1450000"
[697] "260850" "459000" "545000" "519000" "3000000" "1099000" "296000" "128900"
[705] "535000" "80711" "79120" "371000" "81900" "32000" "102526" "102481"
[713] "104311" "112587" "87500" "39000" "104900" "67000" "179500" "126000"
[721] "442650" "476700" "499800" "284997" "374000" "695000" "408400" "196000"
[729] "314000" "151000" "882700" "781820" "819650" "807040" "794430" "197000"
[737] "258000" "321000" "1750000" "338000" "910000" "318200" "738400" "875000"
[745] "555000" "423300" "91500" "998000" "428000" "243000" "154000" "389500"
[753] "68100" "108500" "129500" "73600" "108700" "79200" "60200" "106600"
[761] "27900" "175800" "139900" "110900" "399900" "124900" "211000" "915000"
[769] "166900" "327000" "610000" "1620000" "3089000" "2490000" "700000" "1195000"
[777] "1440000" "1545000" "1730000" "69800" "138900" "86500" "247000" "57000"
[785] "405000" "142900" "91900" "85900" "180380" "151287" "209500" "131990"
[793] "339000" "115900" "82900" "184980" "179900" "214000" "183400" "49900"
[801] "599000" "1490000" "348000" "1950000" "51500" "144800" "159300" "145800"
[809] "289500" "159700" "134900" "241400" "830000" "186070" "86600" "114900"
[817] "476000" "91000" "26500" "191990" "124990" "106150" "100760" "1395000"
[825] "1604320" "129981" "106900" "97900" "122845" "110935" "94900" "202000"
[833] "265500" "390250" "407000" "710000" "394000" "498000" "685000" "112500"
[841] "336000" "261000" "277000" "2500000" "2300000" "960000" "1090000" "166400"
[849] "107900" "78500" "131000" "141050" "100580" "73295" "94160" "125653"
[857] "118500" "357000" "698900" "975000" "760000" "58000" "255982" "95770"
[865] "201370" "134016" "101516" "108216" "139454" "103695" "102215" "105216"
[873] "89700" "100070" "140003" "148539" "146078" "126246" "142473" "145212"
[881] "134006" "136016" "55500" "79600" "71662" "319500" "712600" "379000"
[889] "79750" "73402" "80892" "74965" "76560" "78155" "81345" "52900"
[897] "273000" "2700000" "2350000" "213500" "83285" "112808" "86940" "118950"
[905] "150135" "48000" "1075000" "288000" "434203" "256050" "267075" "278700"
[913] "482893" "184500" "777777" "1111111" "177900" "1130000" "474000" "80900"
[921] "184900" "163510" "147318" "119404" "124572" "121575" "94500" "169149"
[929] "114629" "106754" "70067" "122348" "69008" "252595" "172044" "184375"
[937] "215763" "130921" "174699" "187500" "93636" "197701" "106400" "197975"
[945] "23500" "44000" "849000" "118800" "77292" "75190" "55600" "90931"
[953] "112750" "121900" "89100" "154500" "134700" "930000" "169100" "198300"
[961] "177800" "181700" "101200" "113454" "131500" "168500" "200485" "196500"
[969] "151500" "263000" "149800" "249600" "362300" "247100" "285400" "366000"
[977] "232500" "231200" "213000" "169990" "150500" "180800" "172500" "121500"
[985] "53000" "137800" "167424" "373000" "418000" "2200000" "800000" "472500"
[993] "328000" "310300" "327100" "999999" "542800" "564500" "312300" "547600"
[ reached getOption("max.print") -- omitted 1417 entries ]
#price alta variabilidad, dado que es una variable numérica, #se podría transformar en categórica revisando más a fondo los valores
summary(as.integer(uniqueValues$price))
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 104106 158032 335764 296500 23456789
#Tipo de propiedad
uniqueValues$property_type
[1] "PH" "Casa" "Departamento"
#Las tres gategorías filtradas del dataset: PH, Casa, Departamento
#a.Obtener la cantidad de valores unicos y de valores faltantes (NAs) para cada una de estas variables #Ahora vamos a totalizar los valores faltantes por columnap para esto usamos la función apply para sumar #sobre los resultados de la misma función para listar los is.na por columna #inceptionTime
apply(apply(ar_properties_filtrado,2,is.na),2,sum)
id l3 rooms bedrooms bathrooms
0 136 1975 9861 1211
surface_total surface_covered price property_type
1444 1174 0 0
#b.Obtener la matriz de correlacion para las variables numericas. #Para las correlaciones usaremos la librería corrr
library(tidyverse)
library(corrr)
library(corrplot)
corrplot 0.84 loaded
#Separaremos las variables numéricas
numeric_variables <- ar_properties_filtrado %>%
select(rooms, bedrooms, bathrooms, surface_total, surface_covered, price)
glimpse(numeric_variables)
Observations: 24,323
Variables: 6
$ rooms [3m[38;5;246m<fct>[39m[23m NA, 6, NA, 3, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,...
$ bedrooms [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ bathrooms [3m[38;5;246m<fct>[39m[23m NA, 2, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1...
$ surface_total [3m[38;5;246m<fct>[39m[23m 300, 178, 240, 157, 140, 95, 44, 40, 49, 40, 40, 40, 49, 40, 23, ...
$ surface_covered [3m[38;5;246m<fct>[39m[23m 180, 240, 157, NA, 110, 69, 38, 37, 44, 37, 37, 37, 44, 37, 23, 3...
$ price [3m[38;5;246m<fct>[39m[23m 320000, 500000, 350000, 470000, 155000, 199900, 147000, 92294, 11...
#Transformamos las variables categóricas en numéricas
numeric_variables <- apply(apply(numeric_variables,2,as.character), 2, as.numeric)
crm <- cor(numeric_variables, use="complete.obs", method="pearson")
crm
rooms bedrooms bathrooms surface_total surface_covered price
rooms 1.00000000 0.92979527 0.60318421 0.06364667 0.05561538 0.47595400
bedrooms 0.92979527 1.00000000 0.61396803 0.06507773 0.05684364 0.42622405
bathrooms 0.60318421 0.61396803 1.00000000 0.05664833 0.05071377 0.59618271
surface_total 0.06364667 0.06507773 0.05664833 1.00000000 0.98901389 0.04477758
surface_covered 0.05561538 0.05684364 0.05071377 0.98901389 1.00000000 0.04006500
price 0.47595400 0.42622405 0.59618271 0.04477758 0.04006500 1.00000000
#Generámos la matriz de corelaciones
library(RColorBrewer)
library(corrplot)
n <- ncol(crm)
p.mat<- matrix(NA, n, n)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(crm, method="color", col=col(200),
type="upper", order="hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, #Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = 0.01, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE
)
#Encontramos que: #Todas las correlaciones entre las varialbes son positivas. #Contra la hipótesis inicial no se observa una correlaciónmarcada entre la superficie cubierta y el precio #Se observauna correlaciónentre el precio y el número de baños y cuartos
#3.Preparacion de los datos (II) #a.En el punto 2 deberian haber encontrado que la variable bedrooms presenta una alta #proporción de valores faltantes y que presenta una fuerte correlacion con la variable rooms. #Por lo tanto, vamos a eliminarla. #Nota: surface_total y surface_covered también están estrechamente correlacionadas
ar_properties_filtrado <- ar_properties_filtrado %>% select(-bedrooms)
#b.Eliminar todos los registros que presentan valores faltantes #Para esto usaremos la función drop_na de tidyr
#4.Analisis exploratorios (II) #a.Obtener estadisticas descriptivas para la variable precio (cuartiles, promedio, minimo y maximo) y #realizar un histograma de la variable
summaryPrecio <- summary(as.numeric(as.character(ar_properties_filtrado$price)))
summaryPrecio
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 117000 169000 250258 270000 6000000
priceHistogram <- qplot(as.numeric(as.character(ar_properties_filtrado$price)),
geom="histogram",
main = "Histograma de precios",
xlab = "Precio",
bins = 500)
priceHistogram
#Como vemos el histograma se ve algo distorsionado por los valores extremos que tiene #la variable price
#b.Obtener estadisticas descriptivas para la variable precio #(cuartiles, promedio, minimo y maximo) por cada tipo de propiedad.
summaryPrecioPorTipo <- summary(as.numeric(as.character(ar_properties_filtrado$price)))
summaryPrecioPorTipo <- tapply(as.numeric(as.character(ar_properties_filtrado$price)),
ar_properties_filtrado$property_type, summary)
summaryPrecioPorTipo <- summaryPrecioPorTipo[c("PH","Departamento","Casa")]
summaryPrecioPorTipo
$PH
Min. 1st Qu. Median Mean 3rd Qu. Max.
42000 134450 189000 215670 270000 1200000
$Departamento
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 114000 163000 246667 260762 6000000
$Casa
Min. 1st Qu. Median Mean 3rd Qu. Max.
62900 240000 335000 427946 499225 5000000
#c.Realizar un grafico de boxplot de la variable precio por tipo de propiedad
ar_properties_filtrado$price <- as.numeric(as.character(ar_properties_filtrado$price))
ggplot(ar_properties_filtrado, mapping = aes(x = property_type, y = price,
group = property_type, fill = property_type )) +
geom_boxplot()
#Una vez más los outliers no nos permiten ver muy bien la comparación entre los boxplots
#d.Realizar un correlagrama usando GGally
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Attaching package: 㤼㸱GGally㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
nasa
ggallyData <- ar_properties_filtrado %>% select(rooms, bathrooms, surface_total,
surface_covered, price, property_type)
ggallyData$rooms <- as.numeric(as.character(ggallyData$rooms))
ggallyData$bathrooms <- as.numeric(as.character(ggallyData$bathrooms))
ggallyData$surface_total <- as.numeric(as.character(ggallyData$surface_total))
ggallyData$surface_covered <- as.numeric(as.character(ggallyData$surface_covered))
ggallyData$property_type <- as.factor(as.character(ggallyData$property_type))
levels(ggallyData$property_type)
[1] "Casa" "Departamento" "PH"
#Departamento 17598 -414 #PH 1831 -414 #Casa 712 -414
ggpairs(ggallyData, mapping = aes(color = (ggallyData$property_type)))
plot: [1,1] [==--------------------------------------------------------------] 3% est: 0s
plot: [1,2] [====------------------------------------------------------------] 6% est:22s
plot: [1,3] [=====-----------------------------------------------------------] 8% est:19s
plot: [1,4] [=======---------------------------------------------------------] 11% est:21s
plot: [1,5] [=========-------------------------------------------------------] 14% est:18s
plot: [1,6] [===========-----------------------------------------------------] 17% est:17s
plot: [2,1] [============----------------------------------------------------] 19% est:18s
plot: [2,2] [==============--------------------------------------------------] 22% est:19s
plot: [2,3] [================------------------------------------------------] 25% est:19s
plot: [2,4] [==================----------------------------------------------] 28% est:17s
plot: [2,5] [====================--------------------------------------------] 31% est:16s
plot: [2,6] [=====================-------------------------------------------] 33% est:15s
plot: [3,1] [=======================-----------------------------------------] 36% est:15s
plot: [3,2] [=========================---------------------------------------] 39% est:15s
plot: [3,3] [===========================-------------------------------------] 42% est:14s
plot: [3,4] [============================------------------------------------] 44% est:14s
plot: [3,5] [==============================----------------------------------] 47% est:13s
plot: [3,6] [================================--------------------------------] 50% est:12s
plot: [4,1] [==================================------------------------------] 53% est:11s
plot: [4,2] [====================================----------------------------] 56% est:11s
plot: [4,3] [=====================================---------------------------] 58% est:10s
plot: [4,4] [=======================================-------------------------] 61% est:10s
plot: [4,5] [=========================================-----------------------] 64% est: 9s
plot: [4,6] [===========================================---------------------] 67% est: 8s
plot: [5,1] [============================================--------------------] 69% est: 7s
plot: [5,2] [==============================================------------------] 72% est: 7s
plot: [5,3] [================================================----------------] 75% est: 6s
plot: [5,4] [==================================================--------------] 78% est: 5s
plot: [5,5] [====================================================------------] 81% est: 5s
plot: [5,6] [=====================================================-----------] 83% est: 4s
plot: [6,1] [=======================================================---------] 86% est: 3s
plot: [6,2] [=========================================================-------] 89% est: 3s
plot: [6,3] [===========================================================-----] 92% est: 2s
plot: [6,4] [============================================================----] 94% est: 1s
plot: [6,5] [==============================================================--] 97% est: 1s
plot: [6,6] [================================================================]100% est: 0s
#5.Outliers #a.Eliminar los outliers de la variable precio con algún criterio que elijan. #Los mayores outliers de precio los encontramos en “Casa” y en “Departamento”. #Son outliers superiores en ambos casos, con lo que considero que lo más apropiado sería excluir #los valores que superen 3 distancias inter-cuartil sobre el 3 cuartil.
summaryPrecioPorTipo
$PH
Min. 1st Qu. Median Mean 3rd Qu. Max.
42000 134450 189000 215670 270000 1200000
$Departamento
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 114000 163000 246667 260762 6000000
$Casa
Min. 1st Qu. Median Mean 3rd Qu. Max.
62900 240000 335000 427946 499225 5000000
#Si usamos Casa para calcular las distnacias intercuartil
interqDistance <- as.numeric(summaryPrecioPorTipo$Casa[5] - summaryPrecioPorTipo$Casa[3])
interqDistance
[1] 164225
#Si usamos Departamento para calcular las distnacias intercuartil
interqDistance <- as.numeric(summaryPrecioPorTipo$Departamento[5] - summaryPrecioPorTipo$Departamento[3])
interqDistance
[1] 97762.5
#Usaremos Departamento, dado que el volumen de datos es muhco mayor para propiedades de este tipo. #Se realizaron experimentos también usando casa, pero el filtro no era tan efectivo como se esperaba.
filter <- as.numeric((interqDistance*3) + summaryPrecioPorTipo$Departamento[5])
filter
[1] 554050
ggallyDataNoOutliers <- ggallyData %>% filter(ggallyData$price <= filter)
#Con esto filtramos un outlier de superficie para un PH que dificulta ver las gráficas
ggallyDataNoOutliers <- ggallyDataNoOutliers %>% filter(ggallyDataNoOutliers$surface_covered < 11000)
summary(ggallyDataNoOutliers)
rooms bathrooms surface_total
Min. : 1.000 Min. : 1.00 Min. : 12.00
1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.: 44.00
Median : 3.000 Median : 1.00 Median : 62.00
Mean : 2.638 Mean : 1.38 Mean : 84.78
3rd Qu.: 3.000 3rd Qu.: 2.00 3rd Qu.: 94.00
Max. :12.000 Max. :14.00 Max. :16725.00
surface_covered price property_type
Min. : 3.00 Min. : 6000 Casa : 576
1st Qu.: 40.00 1st Qu.:114000 Departamento:16347
Median : 55.00 Median :160000 PH : 1800
Mean : 72.53 Mean :191495
3rd Qu.: 80.00 3rd Qu.:245000
Max. :10145.00 Max. :550000
#El filtro aplicado permite mucha más claridad en los boxplot de precio, además de mayor claridad #en los gráficos de rooms y de bathrooms, puesto que excluye valores de ouliers extremos que allí se tenían.
#Analisis exploratorios (III) #a.Obtener estadisticas descriptivas para la variable precio (cuartiles, promedio, minimo y maximo) y realizar un histograma de la variable
summaryPrecioFiltrado <- summary(as.numeric(as.character(ggallyDataNoOutliers$price)))
summaryPrecioFiltrado
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 114000 160000 191495 245000 550000
#El nuevo valormáxmo es $ 550.000es mucho menor que el valor que se tenía antes, sin embargo #es el resultado de expcluir los outliers extremos
priceHistogramFiltrado <- qplot(as.numeric(as.character(ggallyDataNoOutliers$price)),
geom="histogram",
main = "Histograma de precios",
xlab = "Precio",
bins = 500)
priceHistogramFiltrado
#El histograma de precios es más claro ahora,se mantiene eso si, siendo una distribucion con cola #a la derecha
#b.Obtener estadisticas descriptivas para la variable precio (cuartiles, promedio, minimo y maximo) por cada tipo de propiedad.
summaryPrecioPorTipoFiltrado <- summary(as.numeric(as.character(ggallyDataNoOutliers$price)))
summaryPrecioPorTipoFiltrado <- tapply(as.numeric(as.character(ggallyDataNoOutliers$price)),
ggallyDataNoOutliers$property_type, summary)
summaryPrecioPorTipoFiltrado <- summaryPrecioPorTipoFiltrado[c("PH","Departamento","Casa")]
summaryPrecioPorTipoFiltrado
$PH
Min. 1st Qu. Median Mean 3rd Qu. Max.
42000 133000 187250 207549 265000 550000
$Departamento
Min. 1st Qu. Median Mean 3rd Qu. Max.
6000 110000 155000 185661 234950 550000
$Casa
Min. 1st Qu. Median Mean 3rd Qu. Max.
62900 220000 290000 306892 395000 550000
#La media más alta sigue estando para los casos de las casas. Los PH y los Departamentos tienen #medias muy similares #c.Realizar un grafico de boxplot de la variable precio por tipo de propiedad
ggallyDataNoOutliers$price <- as.numeric(as.character(ggallyDataNoOutliers$price))
ggplot(ggallyDataNoOutliers, mapping = aes(x = property_type, y = price,
group = property_type, fill = property_type )) +
geom_boxplot()
#EL boxplot refleja valores superiores en general para el caso de las casas (puede darse debido a #su tamañoen metros cuadrados y a su mayor cantidad de ambientes) #d.Realizar un correlagrama usando GGAlly
ggpairs(ggallyDataNoOutliers, mapping = aes(color = (ggallyDataNoOutliers$property_type)))
plot: [1,1] [=--------------------------------------] 3% est: 0s
plot: [1,2] [==-------------------------------------] 6% est:19s
plot: [1,3] [===------------------------------------] 8% est:19s
plot: [1,4] [====-----------------------------------] 11% est:17s
plot: [1,5] [=====----------------------------------] 14% est:16s
plot: [1,6] [======---------------------------------] 17% est:15s
plot: [2,1] [========-------------------------------] 19% est:17s
plot: [2,2] [=========------------------------------] 22% est:18s
plot: [2,3] [==========-----------------------------] 25% est:17s
plot: [2,4] [===========----------------------------] 28% est:15s
plot: [2,5] [============---------------------------] 31% est:14s
plot: [2,6] [=============--------------------------] 33% est:13s
plot: [3,1] [==============-------------------------] 36% est:13s
plot: [3,2] [===============------------------------] 39% est:13s
plot: [3,3] [================-----------------------] 42% est:12s
plot: [3,4] [=================----------------------] 44% est:12s
plot: [3,5] [==================---------------------] 47% est:11s
plot: [3,6] [====================-------------------] 50% est:10s
plot: [4,1] [=====================------------------] 53% est:10s
plot: [4,2] [======================-----------------] 56% est: 9s
plot: [4,3] [=======================----------------] 58% est: 9s
plot: [4,4] [========================---------------] 61% est: 9s
plot: [4,5] [=========================--------------] 64% est: 8s
plot: [4,6] [==========================-------------] 67% est: 7s
plot: [5,1] [===========================------------] 69% est: 7s
plot: [5,2] [============================-----------] 72% est: 6s
plot: [5,3] [=============================----------] 75% est: 6s
plot: [5,4] [==============================---------] 78% est: 5s
plot: [5,5] [===============================--------] 81% est: 4s
plot: [5,6] [================================-------] 83% est: 4s
plot: [6,1] [==================================-----] 86% est: 3s
plot: [6,2] [===================================----] 89% est: 3s
plot: [6,3] [====================================---] 92% est: 2s
plot: [6,4] [=====================================--] 94% est: 1s
plot: [6,5] [======================================-] 97% est: 1s
plot: [6,6] [=======================================]100% est: 0s
#El correlograma nos muestra menos correlación enrte las variables precio y cantidad de baños, #curiosamente no se ve una correlacion entre la superficie y el precio, vemos el precio mas #relacionado con la cantidad de baños y con la cantidad de habitaciones
#7.Modelo lineal #a. Realizar un modelo lineal simple para explicar el precio en función de las habitaciones (rooms) #y otro modelo que explique el precio en función de la superficie total (surface_total)
#Añadimos las librerías necesarias:
library(modelr)
library(broom)
Attaching package: 㤼㸱broom㤼㸲
The following object is masked from 㤼㸱package:modelr㤼㸲:
bootstrap
#Iniciamos con el modelo para las habitaciones
modeloRooms <- lm(rooms ~ price, data = ggallyDataNoOutliers)
ggallyDataNoOutliers %>%
add_predictions(modeloRooms) %>%
ggplot(aes(price, pred)) +
geom_line() +
ggtitle(expression(beta[0] + beta[1]*x))
glance(modeloRooms)
ggallyDataNoOutliers %>%
add_residuals(modeloRooms) %>%
ggplot(aes(price, resid)) +
geom_hline(yintercept = 0, colour = "white", size = 3) +
geom_line() +
ggtitle(expression(+ epsilon))
#Los residuos no siguen un patrón definido que es lo que estábamos esperando #Ahora hacemos el modelo a partir de la superficie total
modeloSurface <- lm(surface_total ~ price, data = ggallyDataNoOutliers)
ggallyDataNoOutliers %>%
add_predictions(modeloSurface) %>%
ggplot(aes(price, pred)) +
geom_line() +
ggtitle(expression(beta[0] + beta[1]*x))
glance(modeloSurface)
ggallyDataNoOutliers %>%
add_residuals(modeloSurface) %>%
ggplot(aes(price, resid)) +
geom_hline(yintercept = 0, colour = "white", size = 3) +
geom_line() +
ggtitle(expression(+ epsilon))
#b. Usar la función summary() para obtener informacion de ambos modelos. Explicar los valores de los coeficientes estimados.
summary(modeloRooms)
Call:
lm(formula = rooms ~ price, data = ggallyDataNoOutliers)
Residuals:
Min 1Q Median 3Q Max
-4.0629 -0.7241 -0.0343 0.5977 8.5227
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.203e+00 1.456e-02 82.65 <2e-16 ***
price 7.494e-06 6.675e-08 112.28 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9533 on 18721 degrees of freedom
Multiple R-squared: 0.4024, Adjusted R-squared: 0.4024
F-statistic: 1.261e+04 on 1 and 18721 DF, p-value: < 2.2e-16
summary(modeloSurface)
Call:
lm(formula = surface_total ~ price, data = ggallyDataNoOutliers)
Residuals:
Min 1Q Median 3Q Max
-138.9 -22.7 -12.9 -0.6 16602.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.752e+01 3.522e+00 4.974 6.63e-07 ***
price 3.512e-04 1.615e-05 21.746 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 230.7 on 18721 degrees of freedom
Multiple R-squared: 0.02464, Adjusted R-squared: 0.02459
F-statistic: 472.9 on 1 and 18721 DF, p-value: < 2.2e-16
#c. ¿Cuál modelo usarían para predecir el precio? ¿Por qué? #Respecto a los resultados de la función Sumary preferiría quedarme con el modelo por #habitaciones, tiene un R-Square mucho más alto que el modelo de superficie, y un error estandrd #muchísimo menor.